{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "collapsed": true
   },
   "source": [
    "\n",
    "Content\n",
    "\n",
    "1. Exploratory Visualization\n",
    "2. Data Cleaning\n",
    "3. Feature Engineering\n",
    "4. Modeling & Evaluation\n",
    "5. Ensemble Methods\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "%matplotlib inline\n",
    "plt.style.use('ggplot')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "e636d756-86d0-4fb0-99a2-370dcd02ca02",
    "_uuid": "8eac0d71e8be3058b0967b3017481aee8c3c7e11",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import RobustScaler, StandardScaler\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.pipeline import Pipeline, make_pipeline\n",
    "from scipy.stats import skew\n",
    "from sklearn.decomposition import PCA, KernelPCA\n",
    "from sklearn.preprocessing import Imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "f1a559e7-f62c-48ea-a024-217a6f1794b1",
    "_uuid": "6b202442ecd6a67c4c15e2ebef04eb0a8ccadf07",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score, GridSearchCV, KFold\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.linear_model import Lasso\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor\n",
    "from sklearn.svm import SVR, LinearSVR\n",
    "from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge\n",
    "from sklearn.kernel_ridge import KernelRidge\n",
    "from xgboost import XGBRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "3f7082ed-a8fb-4e1e-b4ae-0a4d96f6ff06",
    "_uuid": "9534b8ab56e6eeca2e4b9a212cac0d51db0506a0",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = pd.read_csv('../input/train.csv')\n",
    "test = pd.read_csv('../input/test.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "225eb9e4-e3a2-4772-8e8e-73c4002e8048",
    "_uuid": "6e76f6d1039092abce658da7a7d74c3f511f1eb4"
   },
   "source": [
    "**Exploratory Visualization**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "38378084-e1bf-475c-8ac5-ee95c570add6",
    "_uuid": "54018f2e772e728255adb5255a462ffe04184c40",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15,8))\n",
    "sns.boxplot(train.YearBuilt, train.SalePrice)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "1794fd3a-6528-4d81-b12e-ca63d06145c8",
    "_uuid": "206ca1fa9a46f96f52d5ea7829d634e707e52f02",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(12,6))\n",
    "plt.scatter(x=train.GrLivArea, y=train.SalePrice)\n",
    "plt.xlabel(\"GrLivArea\", fontsize=13)\n",
    "plt.ylabel(\"SalePrice\", fontsize=13)\n",
    "plt.ylim(0,800000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "f65e0a44-aaf6-4e46-8f61-b606032da01c",
    "_uuid": "f37136e66c7a724ce1d1ff76c43035f1d6a25034",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train.drop(train[(train[\"GrLivArea\"]>4000)&(train[\"SalePrice\"]<300000)].index,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "f9a0f9f1-4b57-4d49-bfe0-af934b317fa4",
    "_uuid": "44f5cb4eca867268a83f2ce2f74ecea5a753136c",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full=pd.concat([train,test], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "925048e2-e49a-4a68-ab52-5bda9913ebf0",
    "_uuid": "640631627bdc6840747fb941a748dacad3c81172",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full.drop(['Id'],axis=1, inplace=True)\n",
    "full.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "9338635d-04f9-459e-a4bc-c4c5cd2c28b4",
    "_uuid": "cce3091eb8b39a48d6e55f9119074bf3b01cb276"
   },
   "source": [
    "**Data Cleaning**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "a6af2dec-dd5c-4e78-a03d-723a6ad451af",
    "_uuid": "f075026c5210b222b37b08d9d0fde44bd26d06c5",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "aa = full.isnull().sum()\n",
    "aa[aa>0].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "d46def0b-9e5a-474e-a86a-1aef09192f26",
    "_uuid": "688681efe8e997b7e8746db80e1062c7d8007c45",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full.groupby(['Neighborhood'])[['LotFrontage']].agg(['mean','median','count'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "55423004-2a89-4962-8b06-5c4108fa63d3",
    "_uuid": "d28db5f1b50d96c3db62290eff3215c2ba5412ac",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full[\"LotAreaCut\"] = pd.qcut(full.LotArea,10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "3908b351-bd20-4f84-9410-a12de74b3763",
    "_uuid": "742873af4dbeb6198a1c2e2f7eafeceb50c2180c",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full.groupby(['LotAreaCut'])[['LotFrontage']].agg(['mean','median','count'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "159ed003-41bb-4917-924c-efdb1037a43f",
    "_uuid": "e4476136fe446a2bd09ccade03da0a0c7c9d8d2a",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Since some combinations of LotArea and Neighborhood are not available, so we just LotAreaCut alone.\n",
    "full['LotFrontage']=full.groupby(['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))\n",
    "\n",
    "cols=[\"MasVnrArea\", \"BsmtUnfSF\", \"TotalBsmtSF\", \"GarageCars\", \"BsmtFinSF2\", \"BsmtFinSF1\", \"GarageArea\"]\n",
    "for col in cols:\n",
    "    full[col].fillna(0, inplace=True)\n",
    "\n",
    "cols1 = [\"PoolQC\" , \"MiscFeature\", \"Alley\", \"Fence\", \"FireplaceQu\", \"GarageQual\", \"GarageCond\", \"GarageFinish\", \"GarageYrBlt\", \"GarageType\", \"BsmtExposure\", \"BsmtCond\", \"BsmtQual\", \"BsmtFinType2\", \"BsmtFinType1\", \"MasVnrType\"]\n",
    "for col in cols1:\n",
    "    full[col].fillna(\"None\", inplace=True)\n",
    "\n",
    "# fill in with mode\n",
    "cols2 = [\"MSZoning\", \"BsmtFullBath\", \"BsmtHalfBath\", \"Utilities\", \"Functional\", \"Electrical\", \"KitchenQual\", \"SaleType\",\"Exterior1st\", \"Exterior2nd\"]\n",
    "for col in cols2:\n",
    "    full[col].fillna(full[col].mode()[0], inplace=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "e36252f9-03fb-402a-9a6c-4e1da23566bd",
    "_uuid": "1071d0a8d910ce55a9d9547053e966b2f645ea28",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full.isnull().sum()[full.isnull().sum()>0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "2f8570db-b0ec-4920-92fc-00926af65f38",
    "_uuid": "238bd2b9ca554a560bcff9c79443f5b0ff3bfaa1"
   },
   "source": [
    "\n",
    "**Feature Engineering**\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "1dc2e635-6c22-4992-9540-69087d4580f0",
    "_uuid": "9cf19c5953a8d5037c64fd75552bde6fd73d6d13",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "NumStr = [\"MSSubClass\",\"BsmtFullBath\",\"BsmtHalfBath\",\"HalfBath\",\"BedroomAbvGr\",\"KitchenAbvGr\",\"MoSold\",\"YrSold\",\"YearBuilt\",\"YearRemodAdd\",\"LowQualFinSF\",\"GarageYrBlt\"]\n",
    "for col in NumStr:\n",
    "    full[col]=full[col].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "c9576bce-e36a-4d83-aebc-40151f7fdcb4",
    "_uuid": "f14d6f6096c6b6b6c94f850288a5af1ae887ca7e",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full.groupby(['MSSubClass'])[['SalePrice']].agg(['mean','median','count'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "8ac5a238-df7b-4ee5-a377-38fdbe0b3524",
    "_uuid": "8fa6390278dd799f1e0d9bbeb202d5965ba06fcd",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def map_values():\n",
    "    full[\"oMSSubClass\"] = full.MSSubClass.map({'180':1, \n",
    "                                        '30':2, '45':2, \n",
    "                                        '190':3, '50':3, '90':3, \n",
    "                                        '85':4, '40':4, '160':4, \n",
    "                                        '70':5, '20':5, '75':5, '80':5, '150':5,\n",
    "                                        '120': 6, '60':6})\n",
    "    \n",
    "    full[\"oMSZoning\"] = full.MSZoning.map({'C (all)':1, 'RH':2, 'RM':2, 'RL':3, 'FV':4})\n",
    "    \n",
    "    full[\"oNeighborhood\"] = full.Neighborhood.map({'MeadowV':1,\n",
    "                                               'IDOTRR':2, 'BrDale':2,\n",
    "                                               'OldTown':3, 'Edwards':3, 'BrkSide':3,\n",
    "                                               'Sawyer':4, 'Blueste':4, 'SWISU':4, 'NAmes':4,\n",
    "                                               'NPkVill':5, 'Mitchel':5,\n",
    "                                               'SawyerW':6, 'Gilbert':6, 'NWAmes':6,\n",
    "                                               'Blmngtn':7, 'CollgCr':7, 'ClearCr':7, 'Crawfor':7,\n",
    "                                               'Veenker':8, 'Somerst':8, 'Timber':8,\n",
    "                                               'StoneBr':9,\n",
    "                                               'NoRidge':10, 'NridgHt':10})\n",
    "    \n",
    "    full[\"oCondition1\"] = full.Condition1.map({'Artery':1,\n",
    "                                           'Feedr':2, 'RRAe':2,\n",
    "                                           'Norm':3, 'RRAn':3,\n",
    "                                           'PosN':4, 'RRNe':4,\n",
    "                                           'PosA':5 ,'RRNn':5})\n",
    "    \n",
    "    full[\"oBldgType\"] = full.BldgType.map({'2fmCon':1, 'Duplex':1, 'Twnhs':1, '1Fam':2, 'TwnhsE':2})\n",
    "    \n",
    "    full[\"oHouseStyle\"] = full.HouseStyle.map({'1.5Unf':1, \n",
    "                                           '1.5Fin':2, '2.5Unf':2, 'SFoyer':2, \n",
    "                                           '1Story':3, 'SLvl':3,\n",
    "                                           '2Story':4, '2.5Fin':4})\n",
    "    \n",
    "    full[\"oExterior1st\"] = full.Exterior1st.map({'BrkComm':1,\n",
    "                                             'AsphShn':2, 'CBlock':2, 'AsbShng':2,\n",
    "                                             'WdShing':3, 'Wd Sdng':3, 'MetalSd':3, 'Stucco':3, 'HdBoard':3,\n",
    "                                             'BrkFace':4, 'Plywood':4,\n",
    "                                             'VinylSd':5,\n",
    "                                             'CemntBd':6,\n",
    "                                             'Stone':7, 'ImStucc':7})\n",
    "    \n",
    "    full[\"oMasVnrType\"] = full.MasVnrType.map({'BrkCmn':1, 'None':1, 'BrkFace':2, 'Stone':3})\n",
    "    \n",
    "    full[\"oExterQual\"] = full.ExterQual.map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})\n",
    "    \n",
    "    full[\"oFoundation\"] = full.Foundation.map({'Slab':1, \n",
    "                                           'BrkTil':2, 'CBlock':2, 'Stone':2,\n",
    "                                           'Wood':3, 'PConc':4})\n",
    "    \n",
    "    full[\"oBsmtQual\"] = full.BsmtQual.map({'Fa':2, 'None':1, 'TA':3, 'Gd':4, 'Ex':5})\n",
    "    \n",
    "    full[\"oBsmtExposure\"] = full.BsmtExposure.map({'None':1, 'No':2, 'Av':3, 'Mn':3, 'Gd':4})\n",
    "    \n",
    "    full[\"oHeating\"] = full.Heating.map({'Floor':1, 'Grav':1, 'Wall':2, 'OthW':3, 'GasW':4, 'GasA':5})\n",
    "    \n",
    "    full[\"oHeatingQC\"] = full.HeatingQC.map({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})\n",
    "    \n",
    "    full[\"oKitchenQual\"] = full.KitchenQual.map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})\n",
    "    \n",
    "    full[\"oFunctional\"] = full.Functional.map({'Maj2':1, 'Maj1':2, 'Min1':2, 'Min2':2, 'Mod':2, 'Sev':2, 'Typ':3})\n",
    "    \n",
    "    full[\"oFireplaceQu\"] = full.FireplaceQu.map({'None':1, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})\n",
    "    \n",
    "    full[\"oGarageType\"] = full.GarageType.map({'CarPort':1, 'None':1,\n",
    "                                           'Detchd':2,\n",
    "                                           '2Types':3, 'Basment':3,\n",
    "                                           'Attchd':4, 'BuiltIn':5})\n",
    "    \n",
    "    full[\"oGarageFinish\"] = full.GarageFinish.map({'None':1, 'Unf':2, 'RFn':3, 'Fin':4})\n",
    "    \n",
    "    full[\"oPavedDrive\"] = full.PavedDrive.map({'N':1, 'P':2, 'Y':3})\n",
    "    \n",
    "    full[\"oSaleType\"] = full.SaleType.map({'COD':1, 'ConLD':1, 'ConLI':1, 'ConLw':1, 'Oth':1, 'WD':1,\n",
    "                                       'CWD':2, 'Con':3, 'New':3})\n",
    "    \n",
    "    full[\"oSaleCondition\"] = full.SaleCondition.map({'AdjLand':1, 'Abnorml':2, 'Alloca':2, 'Family':2, 'Normal':3, 'Partial':4})            \n",
    "                \n",
    "                        \n",
    "                        \n",
    "    \n",
    "    return \"Done!\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "483c1768-6a3c-4f01-894c-cb0bb1be942e",
    "_uuid": "837ef77fb5545b924306a4a2b692505dd73b19d1",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "map_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "4ed390ae-14c5-4762-bbc9-ff7b4e2090c8",
    "_uuid": "03a905d6d35ae4f58f9c22d27a63063d1b94834f",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# drop two unwanted columns\n",
    "full.drop(\"LotAreaCut\",axis=1,inplace=True)\n",
    "full.drop(['SalePrice'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "18394a36-e959-476d-886d-adc80431467c",
    "_uuid": "3cf53b20d14778ba6f9ebc6c3fd44d66a5482f15"
   },
   "source": [
    "\n",
    "**Pipeline**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "e544bce3-dc98-499a-a028-b267955ed6fc",
    "_uuid": "7d1460253c451f4706a25af5e862c205bdc71cc5",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class labelenc(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self):\n",
    "        pass\n",
    "    \n",
    "    def fit(self,X,y=None):\n",
    "        return self\n",
    "    \n",
    "    def transform(self,X):\n",
    "        lab=LabelEncoder()\n",
    "        X[\"YearBuilt\"] = lab.fit_transform(X[\"YearBuilt\"])\n",
    "        X[\"YearRemodAdd\"] = lab.fit_transform(X[\"YearRemodAdd\"])\n",
    "        X[\"GarageYrBlt\"] = lab.fit_transform(X[\"GarageYrBlt\"])\n",
    "        return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "29df1979-e0c8-47ca-81d9-0d2015144f07",
    "_uuid": "d44209f0a102e2eb6016e3a320619a7a3f0420c4",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class skew_dummies(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self,skew=0.5):\n",
    "        self.skew = skew\n",
    "    \n",
    "    def fit(self,X,y=None):\n",
    "        return self\n",
    "    \n",
    "    def transform(self,X):\n",
    "        X_numeric=X.select_dtypes(exclude=[\"object\"])\n",
    "        skewness = X_numeric.apply(lambda x: skew(x))\n",
    "        skewness_features = skewness[abs(skewness) >= self.skew].index\n",
    "        X[skewness_features] = np.log1p(X[skewness_features])\n",
    "        X = pd.get_dummies(X)\n",
    "        return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b2386cbe-55dd-43a6-86c8-1f13b2c1ef5b",
    "_uuid": "f821c1d04f64343f1b185e6e67aff5b05af4a7bf",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# build pipeline\n",
    "pipe = Pipeline([\n",
    "    ('labenc', labelenc()),\n",
    "    ('skew_dummies', skew_dummies(skew=1)),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "98287404-b6c5-4ec4-8c75-8380fd5fbf0f",
    "_uuid": "b89cf4f6b33f7cbee64af1ffe02e3be83d388cf4",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# save the original data for later use\n",
    "full2 = full.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b6ed51e7-d27c-40df-a23c-90ea29f22ea9",
    "_uuid": "955bc13b1a5213dcf3e8595d4c02129e7ed107a4",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_pipe = pipe.fit_transform(full2)\n",
    "\n",
    "data_pipe.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "7d8e2f96-17c1-4541-ad5b-c14b24ead13f",
    "_uuid": "0107992543b68a7d412d6c1f986ff4aebf2fb99d",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_pipe.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "fce619c5-c4cf-42db-ab54-430ed3349d9b",
    "_uuid": "dd1222aee11200e338a948695825ac16aee2089a",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "scaler = RobustScaler()\n",
    "\n",
    "n_train=train.shape[0]\n",
    "\n",
    "X = data_pipe[:n_train]\n",
    "test_X = data_pipe[n_train:]\n",
    "y= train.SalePrice\n",
    "\n",
    "X_scaled = scaler.fit(X).transform(X)\n",
    "y_log = np.log(train.SalePrice)\n",
    "test_X_scaled = scaler.transform(test_X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "472bbcd3-e36a-4daf-b926-b1d3fd1cb7cd",
    "_uuid": "50b376e135d080cb84ab518cb8110c11ce465a68"
   },
   "source": [
    "**\n",
    "Feature Selection**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "f18689c3-44b0-4c61-b87e-c9223d393fff",
    "_uuid": "b792d676beadc2adcc199ab4e5e7a5d211f75efe",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lasso=Lasso(alpha=0.001)\n",
    "lasso.fit(X_scaled,y_log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "0f73d342-19c1-47bb-b1b0-f48ef86654d3",
    "_uuid": "4359ee5943685ecf92d3379f965c99d526fbc7eb",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "FI_lasso = pd.DataFrame({\"Feature Importance\":lasso.coef_}, index=data_pipe.columns)\n",
    "\n",
    "FI_lasso.sort_values(\"Feature Importance\",ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b9657224-2e7c-4dfb-8c8c-50e4acb91da5",
    "_uuid": "0813121748c0b3401742c78ac910a4a7f0b82b04",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "FI_lasso[FI_lasso[\"Feature Importance\"]!=0].sort_values(\"Feature Importance\").plot(kind=\"barh\",figsize=(15,25))\n",
    "plt.xticks(rotation=90)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "e204170c-c383-4930-a627-bf8326031c4f",
    "_uuid": "c13fbcf8ff23145a57211f38888c28c0f04117d8",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class add_feature(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self,additional=1):\n",
    "        self.additional = additional\n",
    "    \n",
    "    def fit(self,X,y=None):\n",
    "        return self\n",
    "    \n",
    "    def transform(self,X):\n",
    "        if self.additional==1:\n",
    "            X[\"TotalHouse\"] = X[\"TotalBsmtSF\"] + X[\"1stFlrSF\"] + X[\"2ndFlrSF\"]   \n",
    "            X[\"TotalArea\"] = X[\"TotalBsmtSF\"] + X[\"1stFlrSF\"] + X[\"2ndFlrSF\"] + X[\"GarageArea\"]\n",
    "            \n",
    "        else:\n",
    "            X[\"TotalHouse\"] = X[\"TotalBsmtSF\"] + X[\"1stFlrSF\"] + X[\"2ndFlrSF\"]   \n",
    "            X[\"TotalArea\"] = X[\"TotalBsmtSF\"] + X[\"1stFlrSF\"] + X[\"2ndFlrSF\"] + X[\"GarageArea\"]\n",
    "            \n",
    "            X[\"+_TotalHouse_OverallQual\"] = X[\"TotalHouse\"] * X[\"OverallQual\"]\n",
    "            X[\"+_GrLivArea_OverallQual\"] = X[\"GrLivArea\"] * X[\"OverallQual\"]\n",
    "            X[\"+_oMSZoning_TotalHouse\"] = X[\"oMSZoning\"] * X[\"TotalHouse\"]\n",
    "            X[\"+_oMSZoning_OverallQual\"] = X[\"oMSZoning\"] + X[\"OverallQual\"]\n",
    "            X[\"+_oMSZoning_YearBuilt\"] = X[\"oMSZoning\"] + X[\"YearBuilt\"]\n",
    "            X[\"+_oNeighborhood_TotalHouse\"] = X[\"oNeighborhood\"] * X[\"TotalHouse\"]\n",
    "            X[\"+_oNeighborhood_OverallQual\"] = X[\"oNeighborhood\"] + X[\"OverallQual\"]\n",
    "            X[\"+_oNeighborhood_YearBuilt\"] = X[\"oNeighborhood\"] + X[\"YearBuilt\"]\n",
    "            X[\"+_BsmtFinSF1_OverallQual\"] = X[\"BsmtFinSF1\"] * X[\"OverallQual\"]\n",
    "            \n",
    "            X[\"-_oFunctional_TotalHouse\"] = X[\"oFunctional\"] * X[\"TotalHouse\"]\n",
    "            X[\"-_oFunctional_OverallQual\"] = X[\"oFunctional\"] + X[\"OverallQual\"]\n",
    "            X[\"-_LotArea_OverallQual\"] = X[\"LotArea\"] * X[\"OverallQual\"]\n",
    "            X[\"-_TotalHouse_LotArea\"] = X[\"TotalHouse\"] + X[\"LotArea\"]\n",
    "            X[\"-_oCondition1_TotalHouse\"] = X[\"oCondition1\"] * X[\"TotalHouse\"]\n",
    "            X[\"-_oCondition1_OverallQual\"] = X[\"oCondition1\"] + X[\"OverallQual\"]\n",
    "            \n",
    "           \n",
    "            X[\"Bsmt\"] = X[\"BsmtFinSF1\"] + X[\"BsmtFinSF2\"] + X[\"BsmtUnfSF\"]\n",
    "            X[\"Rooms\"] = X[\"FullBath\"]+X[\"TotRmsAbvGrd\"]\n",
    "            X[\"PorchArea\"] = X[\"OpenPorchSF\"]+X[\"EnclosedPorch\"]+X[\"3SsnPorch\"]+X[\"ScreenPorch\"]\n",
    "            X[\"TotalPlace\"] = X[\"TotalBsmtSF\"] + X[\"1stFlrSF\"] + X[\"2ndFlrSF\"] + X[\"GarageArea\"] + X[\"OpenPorchSF\"]+X[\"EnclosedPorch\"]+X[\"3SsnPorch\"]+X[\"ScreenPorch\"]\n",
    "\n",
    "    \n",
    "            return X\n",
    "\n",
    "   \n",
    "pipe = Pipeline([\n",
    "    ('labenc', labelenc()),\n",
    "    ('add_feature', add_feature(additional=2)),\n",
    "    ('skew_dummies', skew_dummies(skew=1)),\n",
    "    ])\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "365795ee-56f5-4ba5-a538-ae6049d6e495",
    "_uuid": "5d7b9e484e056d4c23c2f213af2d8616b06eefb7"
   },
   "source": [
    "PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "35ffb0aa-7306-4a0c-90fb-df9f3ac630eb",
    "_uuid": "558039d7893f08d7b475b30188b8172cf3dce506",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "full_pipe = pipe.fit_transform(full)\n",
    "\n",
    "full_pipe.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "6b6f5013-9a40-4a50-b349-ec7df8597ea8",
    "_uuid": "e233624073363ae297d650eaf194c70876af5336",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_train=train.shape[0]\n",
    "X = full_pipe[:n_train]\n",
    "test_X = full_pipe[n_train:]\n",
    "y= train.SalePrice\n",
    "\n",
    "X_scaled = scaler.fit(X).transform(X)\n",
    "y_log = np.log(train.SalePrice)\n",
    "test_X_scaled = scaler.transform(test_X)\n",
    "\n",
    "pca = PCA(n_components=410)\n",
    "\n",
    "X_scaled=pca.fit_transform(X_scaled)\n",
    "test_X_scaled = pca.transform(test_X_scaled)\n",
    "\n",
    "X_scaled.shape, test_X_scaled.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "68130de6-0d64-45e4-bce6-10d5441b438b",
    "_uuid": "4fbee2f5a292a1ee49524d317a38d9432dd44e7c"
   },
   "source": [
    "**Modeling & Evaluation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "a6c90505-a3c8-4ec0-9b67-ecc4b2a2468b",
    "_uuid": "3e6f26746ca4dbc51af27981762a0f78b2ce245b",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# define cross validation strategy\n",
    "def rmse_cv(model,X,y):\n",
    "    rmse = np.sqrt(-cross_val_score(model, X, y, scoring=\"neg_mean_squared_error\", cv=5))\n",
    "    return rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "7b2176f7-5a84-448c-b809-17595d2434dc",
    "_uuid": "51207de72fc540a3c8da556bb708649b5340e02c"
   },
   "source": [
    "\n",
    "\n",
    "    We choose 13 models and use 5-folds cross-calidation to evaluate these models.\n",
    "\n",
    "Models include:\n",
    "\n",
    "    LinearRegression\n",
    "    Ridge\n",
    "    Lasso\n",
    "    Random Forrest\n",
    "    Gradient Boosting Tree\n",
    "    Support Vector Regression\n",
    "    Linear Support Vector Regression\n",
    "    ElasticNet\n",
    "    Stochastic Gradient Descent\n",
    "    BayesianRidge\n",
    "    KernelRidge\n",
    "    ExtraTreesRegressor\n",
    "    XgBoost\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "1ca0c9a3-afbb-470b-ac74-7f4c8488a260",
    "_uuid": "cbab9a227cf7006c9893c036431253ca8a9d350c",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),\n",
    "          ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(max_iter=1000,tol=1e-3),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),\n",
    "          ExtraTreesRegressor(),XGBRegressor()]\n",
    "\n",
    "names = [\"LR\", \"Ridge\", \"Lasso\", \"RF\", \"GBR\", \"SVR\", \"LinSVR\", \"Ela\",\"SGD\",\"Bay\",\"Ker\",\"Extra\",\"Xgb\"]\n",
    "for name, model in zip(names, models):\n",
    "    score = rmse_cv(model, X_scaled, y_log)\n",
    "    print(\"{}: {:.6f}, {:.4f}\".format(name,score.mean(),score.std()))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "0c905aa3-43e2-4560-a55c-618412bc10aa",
    "_uuid": "3d43f90f3a52af8d139782e3f3a02fd821b986da",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class grid():\n",
    "    def __init__(self,model):\n",
    "        self.model = model\n",
    "    \n",
    "    def grid_get(self,X,y,param_grid):\n",
    "        grid_search = GridSearchCV(self.model,param_grid,cv=5, scoring=\"neg_mean_squared_error\")\n",
    "        grid_search.fit(X,y)\n",
    "        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))\n",
    "        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])\n",
    "        print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "0a739cc8-f726-4b4c-8c4b-2272f5cfdb98",
    "_uuid": "77c92d14c586edc0a2f88079648892ccd782088a"
   },
   "source": [
    "Lasso"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b03ea4dc-314b-4fee-a425-0a24e2fcabf9",
    "_uuid": "96a08de433103c5285d2791c62bf35ab17b79d0b",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grid(Lasso()).grid_get(X_scaled,y_log,{'alpha': [0.0004,0.0005,0.0007,0.0009],'max_iter':[10000]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "74de2800-b304-4c81-8261-6afd05193a33",
    "_uuid": "8d507566e8de0629ca5a3f7b7923c15e2ceb0b60",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grid(Ridge()).grid_get(X_scaled,y_log,{'alpha':[35,40,45,50,55,60,65,70,80,90]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "59bc1512-0870-41c4-9377-3d9d51ef32aa",
    "_uuid": "59fd19e967e38009414131b002b6d0cb9a1e9bf4",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grid(SVR()).grid_get(X_scaled,y_log,{'C':[11,13,15],'kernel':[\"rbf\"],\"gamma\":[0.0003,0.0004],\"epsilon\":[0.008,0.009]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "3d0491ac-e7eb-43f2-94f8-383d27702548",
    "_uuid": "2622a5f37dcda3faa03ba18fcaeacb71d4cccfb7",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "param_grid={'alpha':[0.2,0.3,0.4], 'kernel':[\"polynomial\"], 'degree':[3],'coef0':[0.8,1]}\n",
    "grid(KernelRidge()).grid_get(X_scaled,y_log,param_grid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "1015e0b4-a238-42bc-97c0-1321f4aafc94",
    "_uuid": "72994927b09294a79a18be100e78ca0ef93e9e2f",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grid(ElasticNet()).grid_get(X_scaled,y_log,{'alpha':[0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3],'max_iter':[10000]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "d7639a6a-8325-4f63-b4fe-8ed3c7671737",
    "_uuid": "dc02c18b3d81633500af072773dd0607ac83f5cd",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class AverageWeight(BaseEstimator, RegressorMixin):\n",
    "    def __init__(self,mod,weight):\n",
    "        self.mod = mod\n",
    "        self.weight = weight\n",
    "        \n",
    "    def fit(self,X,y):\n",
    "        self.models_ = [clone(x) for x in self.mod]\n",
    "        for model in self.models_:\n",
    "            model.fit(X,y)\n",
    "        return self\n",
    "    \n",
    "    def predict(self,X):\n",
    "        w = list()\n",
    "        pred = np.array([model.predict(X) for model in self.models_])\n",
    "        # for every data point, single model prediction times weight, then add them together\n",
    "        for data in range(pred.shape[1]):\n",
    "            single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]\n",
    "            w.append(np.sum(single))\n",
    "        return w\n",
    "\n",
    "lasso = Lasso(alpha=0.0005,max_iter=10000)\n",
    "ridge = Ridge(alpha=60)\n",
    "svr = SVR(gamma= 0.0004,kernel='rbf',C=13,epsilon=0.009)\n",
    "ker = KernelRidge(alpha=0.2 ,kernel='polynomial',degree=3 , coef0=0.8)\n",
    "ela = ElasticNet(alpha=0.005,l1_ratio=0.08,max_iter=10000)\n",
    "bay = BayesianRidge()\n",
    "\n",
    "# assign weights based on their gridsearch score\n",
    "w1 = 0.02\n",
    "w2 = 0.2\n",
    "w3 = 0.25\n",
    "w4 = 0.3\n",
    "w5 = 0.03\n",
    "w6 = 0.2\n",
    "\n",
    "weight_avg = AverageWeight(mod = [lasso,ridge,svr,ker,ela,bay],weight=[w1,w2,w3,w4,w5,w6])\n",
    "\n",
    "score = rmse_cv(weight_avg,X_scaled,y_log)\n",
    "print(score.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "bc206ea2-7c35-4905-867c-b8ccdfec1449",
    "_uuid": "c286e4f395488b614f5ea9b6c14a7e15ac9eca2a",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class stacking(BaseEstimator, RegressorMixin, TransformerMixin):\n",
    "    def __init__(self,mod,meta_model):\n",
    "        self.mod = mod\n",
    "        self.meta_model = meta_model\n",
    "        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)\n",
    "        \n",
    "    def fit(self,X,y):\n",
    "        self.saved_model = [list() for i in self.mod]\n",
    "        oof_train = np.zeros((X.shape[0], len(self.mod)))\n",
    "        \n",
    "        for i,model in enumerate(self.mod):\n",
    "            for train_index, val_index in self.kf.split(X,y):\n",
    "                renew_model = clone(model)\n",
    "                renew_model.fit(X[train_index], y[train_index])\n",
    "                self.saved_model[i].append(renew_model)\n",
    "                oof_train[val_index,i] = renew_model.predict(X[val_index])\n",
    "        \n",
    "        self.meta_model.fit(oof_train,y)\n",
    "        return self\n",
    "    \n",
    "    def predict(self,X):\n",
    "        whole_test = np.column_stack([np.column_stack(model.predict(X) for model in single_model).mean(axis=1) \n",
    "                                      for single_model in self.saved_model]) \n",
    "        return self.meta_model.predict(whole_test)\n",
    "    \n",
    "    def get_oof(self,X,y,test_X):\n",
    "        oof = np.zeros((X.shape[0],len(self.mod)))\n",
    "        test_single = np.zeros((test_X.shape[0],5))\n",
    "        test_mean = np.zeros((test_X.shape[0],len(self.mod)))\n",
    "        for i,model in enumerate(self.mod):\n",
    "            for j, (train_index,val_index) in enumerate(self.kf.split(X,y)):\n",
    "                clone_model = clone(model)\n",
    "                clone_model.fit(X[train_index],y[train_index])\n",
    "                oof[val_index,i] = clone_model.predict(X[val_index])\n",
    "                test_single[:,j] = clone_model.predict(test_X)\n",
    "            test_mean[:,i] = test_single.mean(axis=1)\n",
    "        return oof, test_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "d41c658d-a516-4f6a-bac3-10a24789c350",
    "_uuid": "7ee9879eae5a95afd188cfaa3b85497bb5e64933",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# must do imputer first, otherwise stacking won't work, and i don't know why.\n",
    "a = Imputer().fit_transform(X_scaled)\n",
    "b = Imputer().fit_transform(y_log.values.reshape(-1,1)).ravel()\n",
    "\n",
    "stack_model = stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)\n",
    "\n",
    "score = rmse_cv(stack_model,a,b)\n",
    "print(score.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "_cell_guid": "556f9fc7-725e-454f-802a-c38dff8098bd",
    "_uuid": "0b8569b05a5dc7b171b05d898e00fb2b83a844fb"
   },
   "source": [
    "\n",
    "Submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "381b566d-e81e-4be8-a3c4-67fb621899fb",
    "_uuid": "eac08f8563cef36fc40c1971786c4ba4f31db42b",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# This is the final model I use\n",
    "stack_model = stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)\n",
    "\n",
    "stack_model.fit(a,b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "858195af-635d-48e7-882a-6d061678ccd1",
    "_uuid": "30de502de09ee5157c26d9381872bd03d5e69e0c",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred = np.exp(stack_model.predict(test_X_scaled))\n",
    "\n",
    "result=pd.DataFrame({'Id':test.Id, 'SalePrice':pred})\n",
    "result.to_csv(\"submission.csv\",index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
